COMP 162 Python Project¶
Introduction¶
In this project, we will be analyzing the global air quality dataset from OpenAQ to build a model that predicts the concentration of air pollutants in $\frac{\mu g}{m^{3}}$.
Methodology¶
We train a Random Forest regressor to predict pollutant concentrations (µg/m³) from spatial, temporal, and contextual features with a global air quality dataset containing 60,000 data points. We evaluate it using MAE, RMSE, and R², and validate with actual-vs-predicted plots, residual analyses, and feature-importance charts.
Objective¶
We will investigate which geographic regions have the highest concentrations of air pollutants and what factors are most important in predicting pollutant concentration.
Libraries¶
We will start off by importing the necessary packages.
import pandas as pd
import numpy as np
import pyarrow as pa
from pyarrow.csv import read_csv
import plotly.express as px
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, accuracy_score
from math import sqrt
import matplotlib.pyplot as plt
import os
EDA¶
The dataset from OpenAQ has the following columns:
- Country Code
- City
- Location
- Coordinates
- Pollutant
- Source Name
- Unit
- Value
- Last Updated
- Country Label
We now drop entries whose concentrations are recorded in parts per million (PPM) to ensure consistency, as well as entries where the concentration values are outliers or negative numbers.
# Load the OpenAQ export via pyarrow's fast CSV reader, then hand off to pandas.
df = read_csv("openaq.csv").to_pandas()

# Keep only non-negative concentrations (negative readings are sensor errors).
df = df[df['Value'] >= 0].copy()

# Drop rows measured in parts-per-million so every value shares µg/m³ units,
# then discard the now-redundant Unit column.
is_ppm = df['Unit'].str.strip().str.lower() == 'ppm'
df = df.loc[~is_ppm].copy()
df = df.drop(columns=['Unit'])
df.info()

# Tukey fences: keep rows within 1.5×IQR of the quartiles.
q1, q3 = df['Value'].quantile([0.25, 0.75])
iqr = q3 - q1
inlier = df['Value'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)  # inclusive bounds
df_ = df.loc[inlier].copy()
print(f"Dropped {len(df) - len(df_)} rows as IQR outliers")
<class 'pandas.core.frame.DataFrame'> Index: 44013 entries, 0 to 61176 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country Code 44013 non-null object 1 City 44013 non-null object 2 Location 44013 non-null object 3 Coordinates 44013 non-null object 4 Pollutant 44013 non-null object 5 Source Name 44013 non-null object 6 Value 44013 non-null float64 7 Last Updated 44013 non-null datetime64[s, UTC] 8 Country Label 44013 non-null object dtypes: datetime64[s, UTC](1), float64(1), object(7) memory usage: 3.4+ MB Dropped 5941 rows as IQR outliers
Preprocessing¶
We will create two new columns, latitude and longitude, by extracting values from the Coordinates column. This will help us visualize the data geographically.
# Split the "lat, lon" string in Coordinates into two float columns.
coords = df_['Coordinates'].str.extract(r'([\-\d\.]+),\s*([\-\d\.]+)').astype(float)
df_['latitude'] = coords[0]
df_['longitude'] = coords[1]
Mapping out the data¶
We will now sample 10,000 records from our dataset to create a scatter map showing the geographic spread of the air pollution.
# Plot a 10k-row sample on a world map, colored and sized by concentration.
load_dotenv()  # pull MAPBOX_TOKEN from a local .env if one exists
px.set_mapbox_access_token(os.getenv("MAPBOX_TOKEN"))

df_sample = df_.sample(n=10000, random_state=42)

map_kwargs = dict(
    lat="latitude",
    lon="longitude",
    color="Value",
    size="Value",
    hover_name="City",
    hover_data=["Pollutant", "Value"],
    map_style="carto-positron",
    zoom=1,
    height=600,
    title="OpenAQ Measurements Around the World",
)
fig = px.scatter_map(df_sample, **map_kwargs)
fig.update_layout(margin={"r": 0, "t": 40, "l": 40, "b": 0})
fig.show()
More feature extraction¶
We now extract year, month, day, and hour from the UTC timestamps to get a clearer picture of when the pollution was recorded from each node.
# Parse timestamps as timezone-aware UTC, then derive calendar/clock features.
df_['Last Updated'] = pd.to_datetime(df_['Last Updated'], utc=True)
stamps = df_['Last Updated'].dt
for unit in ('year', 'month', 'day', 'hour'):
    df_[unit] = getattr(stamps, unit)

# Feature matrix and regression target for the model.
feature_cols = [
    'Country Code', 'City', 'Pollutant',
    'latitude', 'longitude',
    'year', 'month', 'day', 'hour',
]
X = df_[feature_cols]
y = df_['Value']
Model Selection¶
We train the model on an 80/20 training-testing split and create scikit-learn pipelines for preprocessing and model fitting.
# Hold out 20% of the data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Numeric features: mean-impute missing values, then standardize.
numeric_feats = [
    'latitude','longitude','year','month','day','hour'
]
numeric_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='mean')),
    ('scale', StandardScaler())
])

# Categorical features: impute a sentinel, then one-hot encode.
# handle_unknown='ignore' encodes categories unseen at fit time as all-zeros
# instead of raising at predict time.
categorical_feats = [
    'Country Code','Pollutant', 'City'
]
categorical_pipe = Pipeline([
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('ohe', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_pipe, numeric_feats),
    ('cat', categorical_pipe, categorical_feats),
])

# preprocess → randomforest regression
# FIX: seed the forest with random_state=42 so results are reproducible,
# consistent with the seeded train_test_split (and the seeded .sample above).
# Without it, the bootstrap/feature sampling differs on every run and the
# reported metrics cannot be reproduced.
pipeline = Pipeline([
    ('prep', preprocessor),
    ('rf', RandomForestRegressor(n_jobs=-1, random_state=42))
])

# Fit on the training split and report MAE / RMSE / R² on the held-out set.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(f"MAE = {mean_absolute_error(y_test, y_pred):.4f}")
MSE = mean_squared_error(y_test, y_pred)
RMSE = sqrt(MSE)
r_squared=r2_score(y_test, y_pred)
print(f"RMSE = {RMSE}")
print("R² =", r_squared)
MAE = 8.0507 RMSE = 13.228831135400073 R² = 0.6066749573036019
Results/Findings¶
The model showed an R-Squared score of ≈ 0.6067, indicating the model explained approximately 60% of the variance in the data. Our mean absolute error (MAE) ≈ 8.0507 µg/m³, and our root mean squared error (RMSE) ≈ 13.2288 µg/m³.
Plotting¶
We will now plot Actual vs. Predicted values, Residuals vs. Predicted, the residual distribution and the feature importance.
# Scatter of predictions against ground truth; points on the dashed y = x
# line are perfect predictions.
fig = px.scatter(
    x=y_test,
    y=y_pred,
    labels={'x':'Actual Value', 'y':'Predicted Value'},
    title=f"Actual vs. Predicted Values (R² = {r_squared})"
)

# Reference line spanning the full data range.
lo = min(y_test.min(), y_pred.min())
hi = max(y_test.max(), y_pred.max())
fig.add_shape(
    type='line',
    x0=lo, y0=lo,
    x1=hi, y1=hi,
    line=dict(color='LightSeaGreen', dash='dash')
)

fig.update_layout(width=700, height=500, margin=dict(l=40, r=40, t=60, b=40))
fig.show()
# Residuals (actual − predicted) vs. predictions; a flat band centered on
# the dashed zero line indicates well-behaved errors.
residuals = y_test.values - y_pred

fig1 = px.scatter(
    x=y_pred,
    y=residuals,
    labels={'x':'Predicted Value','y':'Residual (True−Pred)'},
    title='Residuals vs. Predicted'
)
fig1.add_shape(
    type='line',
    x0=y_pred.min(), y0=0,
    x1=y_pred.max(), y1=0,
    line=dict(dash='dash')
)
fig1.show()

# Histogram of the residual distribution.
fig2 = px.histogram(
    residuals,
    nbins=50,
    labels={'value':'Residual','count':'Frequency'},
    title='Distribution of Residuals'
)
fig2.show()
# Pull impurity-based importances from the fitted forest and map them back to
# the post-transformation feature names (including one-hot columns).
fitted_prep = pipeline.named_steps['prep']
fitted_rf = pipeline.named_steps['rf']

imp_df = pd.DataFrame({
    'feature': fitted_prep.get_feature_names_out(),
    'importance': fitted_rf.feature_importances_,
}).sort_values('importance', ascending=False)

# Horizontal bar chart of the 20 most important features.
top_n = 20
fig = px.bar(
    imp_df.head(top_n),
    x='importance',
    y='feature',
    orientation='h',
    title=f'Top {top_n} Feature Importances',
    labels={'importance':'Importance', 'feature':'Feature'}
)
fig.update_layout(
    yaxis={'categoryorder':'total ascending'},
    width=800,
    height=600
)
fig.show()
Conclusion¶
In summary, the Random Forest model was able to capture the major patterns in the OpenAQ data—achieving an MAE of ≈ 8.0507 µg/m³, RMSE of ≈ 13.2288 µg/m³, and R² of ≈ 0.6067—while highlighting latitude, time of day, and pollutant type as the strongest predictors. Residual and feature-importance analyses revealed heteroskedastic errors at low concentrations. A log-transform of the target, richer meteorological inputs, and cross-validated hyperparameter tuning could further improve accuracy and robustness.